Setup

library(tidyverse)
library(magrittr)
library(parallel)
library(here)
library(AnnotationHub)
library(purrr)
library(scales)
library(kableExtra)
library(edgeR)
library(ggrepel)
library(fgsea)
library(RColorBrewer)
library(pheatmap)
library(cqn)
library(ggpubr)
library(DT)
library(htmltools)
library(pander)
if (interactive()) setwd(here::here())
theme_set(theme_bw())
cores <- detectCores() - 1
source(here("R/lbFuncs.R"))

Annotations

ah <- AnnotationHub() %>%
  subset(species == "Danio rerio") %>%
  subset(rdataclass == "EnsDb")
ensDb <- ah[["AH83189"]] ## Ens101
transcripts <- transcripts(ensDb)
txLen <- exonsBy(ensDb, "tx") %>%
  width() %>%
  vapply(sum, integer(1))
mcols(transcripts) <- mcols(transcripts)[
  c("tx_id", "gene_id", "gc_content")
] %>%
  as.data.frame() %>%
  mutate(length = txLen)
geneGc <- transcripts %>%
  mcols() %>%
  as_tibble() %>%
  group_by(gene_id) %>%
  summarise(
    gc_content = sum(gc_content*length) / sum(length),
    length = ceiling(median(length))
  )
genes <- genes(ensDb)
mcols(genes) <- mcols(genes)[
  c("gene_id", "gene_name", "gene_biotype", "entrezid")
] %>%
  as.data.frame() %>%
  left_join(geneGc)

An EnsDb object was obtained for Ensembl release 101 with the AnnotationHub package. This contained the information required to set up gene and transcript annotations. Gene-level estimates of GC content and length were also generated which are required for assessing potential biases in downstream analysis.

Count data

metadata <- read_csv(here("files/samples.csv")) %>%
  dplyr::select(-sample) %>%
  dplyr::rename(sample = basename, genotype = Genotype) %>%
  ## We need some sample aliases that follow R naming conventions
  mutate(
    alias = c(
      paste0(rep("fAD", 7), seq(1, 7)),
      paste0(rep("fAI", 8), seq(1, 8)),
      paste0(rep("wt", 9), seq(1, 9))
    ),
    group = genotype
  )
metadata$genotype <- fct_relevel(
  metadata$genotype,
  c("WT", "EOfAD-like/+", "fAI-like/+")
)
genoCols <- metadata$genotype %>%
  levels() %>%
  length() %>%
  brewer.pal("Set1") %>%
  setNames(levels(metadata$genotype))
counts <- read_tsv(here("03_align/featureCounts/genes.out")) %>%
  set_colnames(basename(colnames(.))) %>%
  set_colnames(str_remove(colnames(.), "Aligned.sortedByCoord.out.bam"))
minLib <- counts %>%
  dplyr::select(., str_subset(colnames(.), "KB")) %>%
  colSums() %>%
  min()
minCPM <- as.vector(cpm(10, lib.size = minLib))
minSamples <- metadata %>%
  split(.$genotype) %>%
  lapply(nrow) %>%
  unlist() %>%
  min()
dgeList <- counts %>%
  as.data.frame() %>%
  column_to_rownames("Geneid") %>%
  .[rowSums(cpm(.) >= minCPM) >= minSamples,] %>%
  DGEList(
    samples = tibble(sample = colnames(.)) %>%
      left_join(metadata),
    genes = genes[rownames(.),] %>%
      as_tibble() %>%
      dplyr::select(chromosome = seqnames, everything())
  ) %>%
  calcNormFactors()

Gene-level count data was loaded from featureCounts and imported into R as a DGEList object. Genes were removed from the counts matrix if they had less than 0.22 CPM (~10 reads) in more than 17 samples. This meant that each remaining gene must have assigned reads detected in at least one genotype group.

21147 genes remained for further analysis with library sizes ranging from 46,298,106 to 71,535,868.

cpm(dgeList, log = TRUE) %>%
  as.data.frame() %>%
  pivot_longer(
    cols = everything(),
    names_to = "sample",
    values_to = "logCPM"
  ) %>%
  left_join(metadata) %>%
  ggplot(aes(logCPM, colour = genotype, group = sample)) +
  geom_density() +
  scale_colour_manual(values = genoCols) +
  labs(
    x = "logCPM",
    y = "Density",
    colour = "Genotype"
  )
*Expression density plots for all samples after filtering*

Expression density plots for all samples after filtering

Initial DE analysis

PCA

pca <- dgeList %>%
  cpm(log = TRUE) %>%
  t() %>%
  prcomp() 
pcaVars <- percent_format(0.1)(summary(pca)$importance["Proportion of Variance",])
pca$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(metadata) %>%
  as_tibble() %>%
  ggplot(aes(PC1, PC2, colour = genotype, fill = genotype, label = alias)) +
  geom_point(size = 2) +
  geom_text_repel(show.legend = FALSE) +
  stat_ellipse(geom = "polygon", alpha = 0.05, show.legend = FALSE) +
  guides(fill = FALSE) +
  scale_colour_manual(values = genoCols) +
  labs(
    x = paste0("PC1 (", pcaVars[["PC1"]], ")"),
    y = paste0("PC2 (", pcaVars[["PC2"]], ")"),
    colour = "Genotype"
  )
*Principal Component Analysis of gene-level counts*

Principal Component Analysis of gene-level counts

A Principal Component Analysis (PCA) was performed using the sample logCPM values. It appears there is no clustering of genotypes, which may be due to the subtlty of the mutations and early time point (6 months). Additionally, PC1 only accounts for 12.1% of the variance.

Design

The design matrix was defined containing an intercept representing baseline expression in wildtype samples. The remaining two columns represent the presence of either the EofAD or fAI mutant alleles.

design <- model.matrix(~genotype, data = dgeList$samples) %>%
  set_colnames(str_remove(colnames(.), "genotype"))
pheatmap(
  design, 
  cluster_cols = FALSE, 
  cluster_rows = FALSE, 
  color = c("white", "grey50"),
  annotation_row = dgeList$samples["genotype"],
  annotation_colors = list(genotype = genoCols),
  angle_col = "315",
  legend = FALSE
)
*Visualisation of the design matrix*

Visualisation of the design matrix

Model fitting

Genewise negative binomial generalized linear models were using the glmFit() function of the edgeR package. Likelihood ratio tests were then performed for the coefficients in the model with glmLRT(). Here the null hypothesis, \(H_0\), is testing that the true values of the coefficients are equal to 0. Due to the subtle effects of the mutation outlined in the PCA, genes were considered to be DE if they had an FDR-adjusted p-value < 0.05, as opposed to more stringent error rate controls such as the Bonferroni method.

alpha <- 0.05
minLfc <- 1
fit <- estimateDisp(dgeList, design) %>%
  glmFit()
topTables <- colnames(design)[2:3] %>%
  sapply(function(x){
    glmLRT(fit, coef = x) %>%
      topTags(n = Inf) %>%
      .[["table"]] %>%
      as_tibble() %>%
      arrange(PValue) %>%
      dplyr::select(
        gene_id, gene_name, logFC, logCPM, PValue, FDR, everything()  
      ) %>%
      mutate(
        coef = x,
        bonfP = p.adjust(PValue, "bonf"),
        DE = ifelse(FDR < 0.05, TRUE, FALSE)
      )
  }, simplify = FALSE)

With this criteria, the following genes were classified as DE:

topTables$`EOfAD-like/+` %>%
  dplyr::filter(DE) %>% 
  dplyr::select(gene_id, gene_name, logFC, logCPM, PValue, FDR, bonfP) %>%
  mutate(
    logCPM = formatC(logCPM, digits = 2, format = "f"),
    logFC = formatC(logFC, digits = 2, format = "f"),
    PValue = formatC(PValue, digits = 2, format = "e"),
    FDR = formatC(FDR, digits = 2, format = "e"),
    bonfP = formatC(bonfP, digits = 2, format = "e"),
  ) %>%
  kable(
    align = "l",
    caption = paste(
      "The", nrow(.), "differentially expressed genes for EOfAD genotype",
      "in comparison to wildtype"
    )
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
The 10 differentially expressed genes for EOfAD genotype in comparison to wildtype
gene_id gene_name logFC logCPM PValue FDR bonfP
ENSDARG00000078322 col12a1a 1.87 3.83 6.26e-21 1.32e-16 1.32e-16
ENSDARG00000093378 si:ch211-235i11.5 0.63 2.96 3.36e-11 3.55e-07 7.10e-07
ENSDARG00000079324 ano9b 1.38 -1.19 3.16e-08 2.22e-04 6.67e-04
ENSDARG00000020448 adi1 0.54 2.81 3.15e-07 1.67e-03 6.67e-03
ENSDARG00000092191 CR318588.1 3.32 2.45 1.25e-06 4.99e-03 2.65e-02
ENSDARG00000001993 myhb 4.04 2.38 1.41e-06 4.99e-03 2.99e-02
ENSDARG00000088298 si:ch211-235i11.4 0.34 3.57 2.44e-06 7.36e-03 5.15e-02
ENSDARG00000094719 CR318588.3 2.05 2.71 4.64e-06 1.23e-02 9.81e-02
ENSDARG00000056065 mov10b.2 1.38 -1.77 1.68e-05 3.95e-02 3.56e-01
ENSDARG00000088507 znf982 0.39 2.28 2.31e-05 4.89e-02 4.89e-01
topTables$`fAI-like/+` %>%
  dplyr::filter(DE) %>% 
  dplyr::select(gene_id, gene_name, logFC, logCPM, PValue, FDR, bonfP) %>%
  mutate(
    logCPM = formatC(logCPM, digits = 2, format = "f"),
    logFC = formatC(logFC, digits = 2, format = "f"),
    PValue = formatC(PValue, digits = 2, format = "e"),
    FDR = formatC(FDR, digits = 2, format = "e"),
    bonfP = formatC(bonfP, digits = 2, format = "e"),
  ) %>%
  kable(
    align = "l",
    caption = paste(
      "The", nrow(.), "differentially expressed genes for fAI genotype",
      "in comparison to wildtype"
    )
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
The 6 differentially expressed genes for fAI genotype in comparison to wildtype
gene_id gene_name logFC logCPM PValue FDR bonfP
ENSDARG00000004870 psen1 -0.78 5.33 3.84e-75 8.13e-71 8.13e-71
ENSDARG00000005179 dglucy -0.96 5.01 3.70e-09 3.91e-05 7.82e-05
ENSDARG00000061758 sh3pxd2ab -3.04 1.06 3.02e-08 2.13e-04 6.39e-04
ENSDARG00000067665 fam167aa -3.39 0.33 7.62e-08 4.03e-04 1.61e-03
ENSDARG00000105453 kcnk17 -1.52 -1.00 7.63e-07 3.23e-03 1.61e-02
ENSDARG00000094098 si:ch211-278p9.3 -2.29 -2.35 1.22e-05 4.28e-02 2.57e-01
deCols <- c(`TRUE` = "red", `FALSE` = "grey50")

Bias checks

MA plots

topTables %>%
  bind_rows() %>%
  arrange(DE) %>%
  ggplot(aes(logCPM, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.4) +
  geom_text_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% dplyr::filter(DE)
  ) +
  geom_smooth(se = FALSE) +
  facet_wrap(~coef, nrow = 2) +
  scale_y_continuous(breaks = seq(-8, 8, by = 2)) +
  scale_colour_manual(values = deCols) +
  theme(legend.position = "none")
*MA plots checking for logFC artefacts across the range of expression values. The average logFC appears consistant across all expression values for both comparisons*

MA plots checking for logFC artefacts across the range of expression values. The average logFC appears consistant across all expression values for both comparisons

GC content

topTables %>%
  bind_rows() %>%
  mutate(stat = -sign(logFC)*log10(PValue)) %>%
  ggplot(aes(gc_content, stat)) +
  geom_point(aes(colour = DE), alpha = 0.4) +
  geom_smooth(se = FALSE) +
  facet_wrap(~coef)  +
  geom_text_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% dplyr::filter(DE)
  ) +
  labs(
    x = " Gene GC content (%)",
    y = "Ranking Statistic"
  ) +
  coord_cartesian(ylim = c(-10, 10)) +
  scale_colour_manual(values = deCols) +
  theme(legend.position = "none")
*GC content bias for differential expression. GC content is shown against the ranking statistic, calculated by -log10(p) multiple by the sign of logFC. Some bias was observed for both comparisons. In particular, the fAI genotype shows a well-documented pattern of GC bias, whereby both low and high GC proportions are affected. This may be of concern for this dataset.*

GC content bias for differential expression. GC content is shown against the ranking statistic, calculated by -log10(p) multiple by the sign of logFC. Some bias was observed for both comparisons. In particular, the fAI genotype shows a well-documented pattern of GC bias, whereby both low and high GC proportions are affected. This may be of concern for this dataset.

Gene length

topTables %>%
  bind_rows() %>%
  mutate(stat = -sign(logFC)*log10(PValue)) %>%
  ggplot(aes(length, stat)) +
  geom_point(aes(colour = DE), alpha = 0.4) +
  geom_smooth(se = FALSE) +
  facet_wrap(~coef)  +
  geom_text_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% dplyr::filter(DE)
  ) +
  labs(
    x = "GC length",
    y = "Ranking Statistic"
  ) +
  coord_cartesian(ylim = c(-10, 10)) +
  scale_x_log10(labels = comma) +
  scale_colour_manual(values = deCols) +
  theme(legend.position = "none")
*Gene length bias for differential expression. Gene length is shown against the ranking statistic, calculated by -log10(p) multiple by the sign of logFC. No bias is observed for either comparison.*

Gene length bias for differential expression. Gene length is shown against the ranking statistic, calculated by -log10(p) multiple by the sign of logFC. No bias is observed for either comparison.

Conditional Quantile Normalisation

Conditional Quantile Normalisation (CQN) is a procedure that can account for GC content and length biases in differential expression and is implemented with package cqn. It does this by making by calculating offset values that can be incorporated into the GLM. As GC content was noted as being of concern, but length showed no bias, only GC content was used to calculate the offsets. The glm.offset values from the output of cqn were added to the original DGEList object.

cqn <- cqn(
  counts = dgeList$counts,
  x = dgeList$genes$gc_content,
  lengths = rep(1000, times = nrow(dgeList)),
  sizeFactors = dgeList$samples$lib.size,
  lengthMethod = "fixed"
)
cqnplot(cqn, n = 1, xlab = "GC Content", col = genoCols)
legend(
  "bottomright",
  legend = levels(metadata$genotype),
  col = genoCols,
  lty = 1
)
*Model fits for GC content under the CQN model. *

Model fits for GC content under the CQN model.

dgeList$offset <- cqn$glm.offset

PCA

cpm_cqn <- cqn %>%
  with(y + offset)
pca_cqn <- cpm_cqn %>%
  t() %>%
  prcomp() 
pcaVars <- percent_format(0.1)(summary(pca_cqn)$importance["Proportion of Variance",])
pca_cqn$x %>%
  as.data.frame() %>%
  rownames_to_column("sample") %>%
  left_join(metadata) %>%
  as_tibble() %>%
  ggplot(aes(PC1, PC2, colour = genotype, fill = genotype, label = alias)) +
  geom_point(size = 2) +
  geom_text_repel(show.legend = FALSE) +
  stat_ellipse(geom = "polygon", alpha = 0.05, show.legend = FALSE) +
  guides(fill = FALSE) +
  scale_colour_manual(values = genoCols) +
  labs(
    x = paste0("PC1 (", pcaVars[["PC1"]], ")"),
    y = paste0("PC2 (", pcaVars[["PC2"]], ")"),
    colour = "Genotype"
  )
*Principal Component Analysis of gene-level counts post CQN.*

Principal Component Analysis of gene-level counts post CQN.

There was very little improvement in separation of genotype groups after CQN, and PC1 still only accounted for 11.4% of the variation. The x-axis of the PCA plot appears to be flipped, but this normal as the signs of PC values are somewhat arbitrary.

Model fitting

After incorporation of the offset values into the DGEList object, dispersion estimates were calculated and models were fitted as done pre CQN.

fit_cqn <- estimateDisp(dgeList, design) %>%
  glmFit()
topTables_cqn <- colnames(design)[2:3] %>%
  sapply(function(x){
    glmLRT(fit_cqn, coef = x) %>%
      topTags(n = Inf) %>%
      .[["table"]] %>%
      as_tibble() %>%
      arrange(PValue) %>%
      dplyr::select(
        gene_id, gene_name, logFC, logCPM, PValue, FDR, everything()  
      ) %>%
      mutate(
        coef = x,
        bonfP = p.adjust(PValue, "bonf"),
        DE = ifelse(FDR < 0.05, TRUE, FALSE)
      )
  }, simplify = FALSE)
topTables_cqn$`EOfAD-like/+` %>%
  dplyr::filter(DE) %>% 
  dplyr::select(gene_id, gene_name, logFC, logCPM, PValue, FDR, bonfP) %>%
  mutate(
    logCPM = formatC(logCPM, digits = 2, format = "f"),
    logFC = formatC(logFC, digits = 2, format = "f"),
    PValue = formatC(PValue, digits = 2, format = "e"),
    FDR = formatC(FDR, digits = 2, format = "e"),
    bonfP = formatC(bonfP, digits = 2, format = "e"),
  ) %>%
  kable(
    align = "l",
    caption = paste(
      "The", nrow(.), "differentially expressed genes for EOfAD genotype",
      "in comparison to wildtype after CQN"
    )
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
The 8 differentially expressed genes for EOfAD genotype in comparison to wildtype after CQN
gene_id gene_name logFC logCPM PValue FDR bonfP
ENSDARG00000078322 col12a1a 1.87 3.83 9.45e-21 2.00e-16 2.00e-16
ENSDARG00000093378 si:ch211-235i11.5 0.62 2.96 4.52e-11 4.78e-07 9.55e-07
ENSDARG00000020448 adi1 0.52 2.81 2.88e-09 2.03e-05 6.09e-05
ENSDARG00000079324 ano9b 1.35 -1.19 6.58e-09 3.48e-05 1.39e-04
ENSDARG00000001993 myhb 4.22 2.38 1.47e-06 6.23e-03 3.11e-02
ENSDARG00000092191 CR318588.1 3.21 2.45 2.25e-06 7.93e-03 4.76e-02
ENSDARG00000094719 CR318588.3 2.01 2.71 3.33e-06 1.01e-02 7.04e-02
ENSDARG00000088298 si:ch211-235i11.4 0.33 3.57 6.47e-06 1.71e-02 1.37e-01
topTables_cqn$`fAI-like/+` %>%
  dplyr::filter(DE) %>% 
  dplyr::select(gene_id, gene_name, logFC, logCPM, PValue, FDR, bonfP) %>%
  mutate(
    logCPM = formatC(logCPM, digits = 2, format = "f"),
    logFC = formatC(logFC, digits = 2, format = "f"),
    PValue = formatC(PValue, digits = 2, format = "e"),
    FDR = formatC(FDR, digits = 2, format = "e"),
    bonfP = formatC(bonfP, digits = 2, format = "e"),
  ) %>%
  kable(
    align = "l",
    caption = paste(
      "The", nrow(.), "differentially expressed genes for fAI genotype",
      "in comparison to wildtype after CQN"
    )
  ) %>%
  kable_styling(
    bootstrap_options = c("striped", "hover", "condensed", "responsive")
  )
The 13 differentially expressed genes for fAI genotype in comparison to wildtype after CQN
gene_id gene_name logFC logCPM PValue FDR bonfP
ENSDARG00000004870 psen1 -0.81 5.33 8.89e-52 1.88e-47 1.88e-47
ENSDARG00000061758 sh3pxd2ab -3.30 1.06 2.20e-09 2.32e-05 4.65e-05
ENSDARG00000005179 dglucy -1.01 5.01 6.88e-09 4.85e-05 1.45e-04
ENSDARG00000067665 fam167aa -3.62 0.33 1.23e-08 6.52e-05 2.61e-04
ENSDARG00000105453 kcnk17 -1.66 -1.00 9.78e-08 4.14e-04 2.07e-03
ENSDARG00000101775 zgc:172014 -2.06 -0.43 1.49e-06 5.25e-03 3.15e-02
ENSDARG00000079992 senp6b 1.79 -0.96 3.21e-06 9.70e-03 6.79e-02
ENSDARG00000104031 mrpl33 -0.26 4.08 5.41e-06 1.43e-02 1.14e-01
ENSDARG00000053240 kif6 -0.89 0.78 1.11e-05 2.60e-02 2.34e-01
ENSDARG00000092522 nnt2 -0.77 0.29 1.36e-05 2.88e-02 2.88e-01
ENSDARG00000081758 NC_002333.12 -1.10 0.95 2.09e-05 3.78e-02 4.43e-01
ENSDARG00000037613 lgals8b 0.36 4.43 2.15e-05 3.78e-02 4.54e-01
ENSDARG00000100743 si:dkey-190j3.4 1.08 -0.52 2.84e-05 4.62e-02 6.01e-01

Bias checks

MA plots

topTables_cqn %>%
  bind_rows() %>%
  arrange(DE) %>%
  ggplot(aes(logCPM, logFC)) +
  geom_point(aes(colour = DE), alpha = 0.4) +
  geom_text_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% dplyr::filter(DE)
  ) +
  geom_smooth(se = FALSE) +
  facet_wrap(~coef, nrow = 2) +
  scale_y_continuous(breaks = seq(-8, 8, by = 2)) +
  scale_colour_manual(values = deCols) +
  theme(legend.position = "none")
*MA plots checking for logFC artefacts across the range of expression values. The average logFC remains consistant across all expression values for both comparisons after CQN*

MA plots checking for logFC artefacts across the range of expression values. The average logFC remains consistant across all expression values for both comparisons after CQN

GC content

topTables_cqn %>%
  bind_rows() %>%
  mutate(stat = -sign(logFC)*log10(PValue)) %>%
  ggplot(aes(gc_content, stat)) +
  geom_point(aes(colour = DE), alpha = 0.4) +
  geom_smooth(se = FALSE) +
  facet_wrap(~coef)  +
  geom_text_repel(
    aes(label = gene_name, colour = DE),
    data = . %>% dplyr::filter(DE)
  ) +
  labs(
    x = "Gene GC content (%)",
    y = "Ranking Statistic"
  ) +
  coord_cartesian(ylim = c(-10, 10)) +
  scale_colour_manual(values = deCols) +
  theme(legend.position = "none")
*GC content bias for differential expression. GC content is shown against the ranking statistic, calculated by -log10(p) multiple by the sign of logFC. There was still a small amount of residual bias post CQN, but an improvement was noted, particulary for the fAI genotype.*

GC content bias for differential expression. GC content is shown against the ranking statistic, calculated by -log10(p) multiple by the sign of logFC. There was still a small amount of residual bias post CQN, but an improvement was noted, particulary for the fAI genotype.

Results

topTables_cqn %>%
  bind_rows() %>%
  ggplot(aes(logFC, -log10(PValue), colour = DE)) +
  geom_point(alpha = 0.4) +
  geom_text_repel(
    aes(label = gene_name),
    size = 3,
    data = . %>% dplyr::filter(DE)
  ) +
  geom_text_repel(
    aes(label = gene_name),
    size = 3,
    data = . %>% dplyr::filter(!DE, logFC < -3)
  ) +
  facet_wrap(~coef, nrow = 1) +
  scale_colour_manual(values = deCols) +
  scale_x_continuous(breaks = seq(-8, 8, by = 2)) +
  coord_cartesian(ylim = c(0, 10)) +
  labs(y = log10plab) +
  theme(legend.position = "none") 
*Volcano plots showing -log10(p) against logFC. Genes determined to be DE are coloured red.*

Volcano plots showing -log10(p) against logFC. Genes determined to be DE are coloured red.

geneLabeller <- as_labeller(
  structure(dgeList$genes$gene_name, names = dgeList$genes$gene_id)
)
fadComp <- metadata %>%
  dplyr::filter(str_detect(alias, "(fAD|wt)")) %>%
  pull(sample)
faiComp <- metadata %>%
  dplyr::filter(str_detect(alias, "(fAI|wt)")) %>%
  pull(sample)
deGenes <- lapply(topTables_cqn, function(x){
  dplyr::filter(x, DE) %>%
    pull(gene_id)
})
deCpmPlots <- lapply(seq_along(deGenes), function(x){
  cpms <- cpm_cqn[deGenes[[x]],] %>% 
    as.data.frame() %>% 
    rownames_to_column("gene_id") %>% 
    gather(key = "sample", value = logCPM, -gene_id) %>% 
    as_tibble() %>%
    left_join(metadata[,c("sample", "genotype")])
  if (names(deGenes[x]) == "EOfAD-like/+") {
    dplyr::filter(cpms, sample %in% fadComp) %>%
      ggplot(aes(genotype, logCPM)) +
      geom_boxplot(aes(fill = genotype)) +
      labs(title = paste(names(deGenes[x]), "DE genes")) +
      facet_wrap(
        ~gene_id,
        labeller = geneLabeller,
        nrow = 2
      ) +
      theme(
        legend.position = "none",
        title = element_text(face = "bold")
      )
  } else if (names(deGenes[x]) == "fAI-like/+") {
    dplyr::filter(cpms, sample %in% faiComp) %>%
      ggplot(aes(genotype, logCPM)) +
      geom_boxplot(aes(fill = genotype)) +
      labs(title = paste(names(deGenes[x]), "DE genes")) +
      facet_wrap(
        ~gene_id,
        labeller = geneLabeller,
        nrow = 4
      ) +
      theme(
        legend.position = "none",
        title = element_text(face = "bold")
      )
  }
})

EOfAD-like mutation

A set of 8 were classified as DE due to the EOfAD-like/+ genotype. The highest ranked genes by p-value are shown in the following table. Interestingly, 4 of these genes were located on chromosome 17, the same chromosome as the mutation.

topTables_cqn$`EOfAD-like/+` %>%
  dplyr::slice(1:1000) %>%
  dplyr::select(gene_id, gene_name, logFC, logCPM, PValue, FDR, bonfP, DE) %>%
  mutate(
    logCPM = formatC(logCPM, digits = 2, format = "f"),
    logFC = formatC(logFC, digits = 2, format = "f"),
    PValue = formatC(PValue, digits = 2, format = "e"),
    FDR = formatC(FDR, digits = 2, format = "e"),
    bonfP = formatC(bonfP, digits = 2, format = "e"),
  ) %>%
  datatable(
    caption = htmltools::tags$caption(htmltools::em(paste0(
      "EOfAD-like/+ genotype: The top 1000 genes ranked by p-value from ",
      "differential expression testing."
    ))),
    filter = "top"
  )
cpm_cqn[deGenes$`EOfAD-like/+`,] %>%
  as.data.frame() %>%
  dplyr::select(all_of(fadComp)) %>%
  set_rownames(idToSym(rownames(.), genes)) %>%
  pheatmap::pheatmap(
    color = viridis_pal(option = "magma")(100),
    legend_breaks = c(seq(-2, 8, by = 2), max(.)),
    legend_labels = c(seq(-2, 8, by = 2), "logCPM\n"),
    annotation_col = dgeList$samples %>%
      dplyr::select(Genotype = genotype),
    annotation_names_col = FALSE,
    annotation_colors = list(Genotype = genoCols),
    cutree_rows = 3,
    angle_col = "315"
  )
*Heatmap of DE genes in the comparison between EOfAD-like/+ and WT genotypes. Values a logCPM after CQN.*

Heatmap of DE genes in the comparison between EOfAD-like/+ and WT genotypes. Values a logCPM after CQN.

deCpmPlots[[1]]
*logCPM boxplots between comparative groups for genes classified as DE.*

logCPM boxplots between comparative groups for genes classified as DE.

fAI-like mutation

A set of 13 were classified as DE due to the fAI-like/+ genotype. The highest ranked genes by p-value are shown in the following table. The location of DE genes followed a similar trend to the Alzheimer’s mutant with 9 genes located on chromosome 17.

topTables_cqn$`fAI-like/+` %>%
  dplyr::slice(1:1000) %>%
  dplyr::select(gene_id, gene_name, logFC, logCPM, PValue, FDR, bonfP, DE) %>%
  mutate(
    logCPM = formatC(logCPM, digits = 2, format = "f"),
    logFC = formatC(logFC, digits = 2, format = "f"),
    PValue = formatC(PValue, digits = 2, format = "e"),
    FDR = formatC(FDR, digits = 2, format = "e"),
    bonfP = formatC(bonfP, digits = 2, format = "e"),
  ) %>%
  datatable(
    caption = htmltools::tags$caption(htmltools::em(paste0(
      "fAI-like/+ genotype: The top 1000 genes ranked by p-value from ",
      "differential expression testing."
    ))),
    filter = "top"
  )
cpm_cqn[deGenes$`fAI-like/+`,] %>%
  as.data.frame() %>%
  dplyr::select(all_of(faiComp)) %>%
  set_rownames(idToSym(rownames(.), genes)) %>%
  pheatmap::pheatmap(
    color = viridis_pal(option = "magma")(100),
    legend_breaks = c(seq(-2, 8, by = 2), max(.)),
    legend_labels = c(seq(-2, 8, by = 2), "logCPM\n"),
    annotation_col = dgeList$samples %>%
      dplyr::select(Genotype = genotype),
    annotation_names_col = FALSE,
    annotation_colors = list(Genotype = genoCols),
    cutree_rows = 2,
    angle_col = "315",
  )
*Heatmap of DE genes in the comparison between fAI-like/+ and WT genotypes. Values a logCPM after CQN.*

Heatmap of DE genes in the comparison between fAI-like/+ and WT genotypes. Values a logCPM after CQN.

deCpmPlots[[2]]
*logCPM boxplots between comparative groups for genes classified as DE.*

logCPM boxplots between comparative groups for genes classified as DE.

Data export

Final results were exported as Rds files for further analysis.

saveRDS(topTables, here("files/topTables.Rds"))
saveRDS(topTables_cqn, here("files/topTables_cqn.Rds"))
sessionInfo() %>%
  pander()

R version 4.1.0 (2021-05-18)

Platform: x86_64-pc-linux-gnu (64-bit)

locale: LC_CTYPE=en_AU.UTF-8, LC_NUMERIC=C, LC_TIME=en_AU.UTF-8, LC_COLLATE=en_AU.UTF-8, LC_MONETARY=en_AU.UTF-8, LC_MESSAGES=en_AU.UTF-8, LC_PAPER=en_AU.UTF-8, LC_NAME=C, LC_ADDRESS=C, LC_TELEPHONE=C, LC_MEASUREMENT=en_AU.UTF-8 and LC_IDENTIFICATION=C

attached base packages: stats4, splines, parallel, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: ensembldb(v.2.16.0), AnnotationFilter(v.1.16.0), GenomicFeatures(v.1.44.0), AnnotationDbi(v.1.54.0), Biobase(v.2.52.0), GenomicRanges(v.1.44.0), GenomeInfoDb(v.1.28.0), IRanges(v.2.26.0), S4Vectors(v.0.30.0), pander(v.0.6.3), htmltools(v.0.5.1.1), DT(v.0.18), ggpubr(v.0.4.0), cqn(v.1.38.0), quantreg(v.5.85), SparseM(v.1.81), preprocessCore(v.1.54.0), nor1mix(v.1.3-0), mclust(v.5.4.7), pheatmap(v.1.0.12), RColorBrewer(v.1.1-2), fgsea(v.1.18.0), ggrepel(v.0.9.1), edgeR(v.3.34.0), limma(v.3.48.0), kableExtra(v.1.3.4), scales(v.1.1.1), AnnotationHub(v.3.0.0), BiocFileCache(v.2.0.0), dbplyr(v.2.1.1), BiocGenerics(v.0.38.0), here(v.1.0.1), magrittr(v.2.0.1), forcats(v.0.5.1), stringr(v.1.4.0), dplyr(v.1.0.6), purrr(v.0.3.4), readr(v.1.4.0), tidyr(v.1.1.3), tibble(v.3.1.2), ggplot2(v.3.3.3) and tidyverse(v.1.3.1)

loaded via a namespace (and not attached): utf8(v.1.2.1), tidyselect(v.1.1.1), RSQLite(v.2.2.7), htmlwidgets(v.1.5.3), grid(v.4.1.0), BiocParallel(v.1.26.0), munsell(v.0.5.0), withr(v.2.4.2), colorspace(v.2.0-1), filelock(v.1.0.2), highr(v.0.9), knitr(v.1.33), rstudioapi(v.0.13), ggsignif(v.0.6.1), MatrixGenerics(v.1.4.0), labeling(v.0.4.2), GenomeInfoDbData(v.1.2.6), bit64(v.4.0.5), farver(v.2.1.0), rprojroot(v.2.0.2), vctrs(v.0.3.8), generics(v.0.1.0), xfun(v.0.23), R6(v.2.5.0), locfit(v.1.5-9.4), bitops(v.1.0-7), cachem(v.1.0.5), DelayedArray(v.0.18.0), assertthat(v.0.2.1), promises(v.1.2.0.1), BiocIO(v.1.2.0), gtable(v.0.3.0), conquer(v.1.0.2), rlang(v.0.4.11), MatrixModels(v.0.5-0), systemfonts(v.1.0.2), rtracklayer(v.1.52.0), rstatix(v.0.7.0), lazyeval(v.0.2.2), broom(v.0.7.6), BiocManager(v.1.30.15), yaml(v.2.2.1), abind(v.1.4-5), modelr(v.0.1.8), crosstalk(v.1.1.1), backports(v.1.2.1), httpuv(v.1.6.1), tools(v.4.1.0), ellipsis(v.0.3.2), jquerylib(v.0.1.4), Rcpp(v.1.0.6), progress(v.1.2.2), zlibbioc(v.1.38.0), RCurl(v.1.98-1.3), prettyunits(v.1.1.1), SummarizedExperiment(v.1.22.0), haven(v.2.4.1), fs(v.1.5.0), data.table(v.1.14.0), openxlsx(v.4.2.3), reprex(v.2.0.0), ProtGenerics(v.1.24.0), matrixStats(v.0.58.0), hms(v.1.1.0), mime(v.0.10), evaluate(v.0.14), xtable(v.1.8-4), XML(v.3.99-0.6), rio(v.0.5.26), readxl(v.1.3.1), gridExtra(v.2.3), compiler(v.4.1.0), biomaRt(v.2.48.0), crayon(v.1.4.1), mgcv(v.1.8-36), later(v.1.2.0), lubridate(v.1.7.10), DBI(v.1.1.1), MASS(v.7.3-54), rappdirs(v.0.3.3), Matrix(v.1.3-4), car(v.3.0-10), cli(v.2.5.0), pkgconfig(v.2.0.3), GenomicAlignments(v.1.28.0), foreign(v.0.8-81), xml2(v.1.3.2), svglite(v.2.0.0), bslib(v.0.2.5.1), webshot(v.0.5.2), XVector(v.0.32.0), rvest(v.1.0.0), digest(v.0.6.27), Biostrings(v.2.60.0), rmarkdown(v.2.8), cellranger(v.1.1.0), fastmatch(v.1.1-0), restfulr(v.0.0.13), curl(v.4.3.1), shiny(v.1.6.0), Rsamtools(v.2.8.0), rjson(v.0.2.20), lifecycle(v.1.0.0), nlme(v.3.1-152), jsonlite(v.1.7.2), carData(v.3.0-4), viridisLite(v.0.4.0), fansi(v.0.5.0), pillar(v.1.6.1), lattice(v.0.20-44), KEGGREST(v.1.32.0), fastmap(v.1.1.0), httr(v.1.4.2), interactiveDisplayBase(v.1.30.0), glue(v.1.4.2), zip(v.2.2.0), png(v.0.1-7), BiocVersion(v.3.13.1), bit(v.4.0.4), stringi(v.1.6.2), sass(v.0.4.0), blob(v.1.2.1) and memoise(v.2.0.0)